sorting list (for the extra bonus of the homework)


In [24]:
x = ["aardvark", "bee", 'croco', 'duck', "emo"]

In [25]:
#sorted by second letter ['aardvark', 'bee', 'emu', 'crocodrile; 'duck' ]
sorted(x, reverse=True)


Out[25]:
['emo', 'duck', 'croco', 'bee', 'aardvark']

In [26]:
#sorted(x, key=???) when you want to sort by the second letter of the list.

In [27]:
def get_second_letter(s):
    return s[1]

In [28]:
get_second_letter("cheese")


Out[28]:
'h'

In [29]:
sorted(x, key=get_second_letter)


Out[29]:
['aardvark', 'bee', 'emo', 'croco', 'duck']

Lambda functions!

a way of writing a function on a single line


In [30]:
#normal function
def get_second_letter(s):
    return s[1]

In [31]:
get_second_letter = lambda s: s[1]

In [32]:
get_second_letter("hello")


Out[32]:
'e'

In [35]:
sorted(x, key = lambda s: s[1])


Out[35]:
['aardvark', 'bee', 'emo', 'croco', 'duck']

In [19]:
# [P['name'] for p in sorted(planets, hey=lambda x: x['moons'])]

In [20]:
# def get moon_count(d):
#     return d['moons']
# sorted(planets, key=get_moon_count)

In [21]:
#written in  SQL:
#Select name from planet order by moons

tuple


In [39]:
t = [5]

In [40]:
for item in t:
    print(item * item)


25

In [41]:
t.append(30)

In [42]:
carefree_list = [5, 33, 32, 66, 44]

In [43]:
carefree_list[1] = 'Mr Fluffypants'

In [45]:
carefree_list


Out[45]:
[5, 'Mr Fluffypants', 32, 66, 44]

In [48]:
t[1] = 'Mr. Fluffypants'

In [49]:
t


Out[49]:
[5, 'Mr. Fluffypants']

In [50]:
#inmutable data type
#one benefit is exactly that: cant be changed
#other benefit is that tuples are memory-effcient

In [51]:
import sys

In [52]:
hello = [1, 2, 3]

In [54]:
sys.getsizeof(hello)


Out[54]:
88

Back to regular expressions for a moment

Grouping with multiple matches in th same string


In [59]:
test = "one 1 two 2 three 3 four 4 five 5"

In [63]:
import re
re.findall(r"\w+ \d", test)


Out[63]:
['one 1', 'two 2', 'three 3', 'four 4', 'five 5']

In [64]:
for item in re.findall(r"(\w+) (\d)", test):


  File "<ipython-input-64-b4449377acf8>", line 1
    for item in re.findall(r"(\w+) (\d)", test):
                                                ^
SyntaxError: unexpected EOF while parsing

In [65]:
all_subjects = open("enronsubjects.txt").read()

In [68]:
[item[0] for item in re.findall (r"(\d{3})-(\d{3})-(\d{4})", all_subjects)]


Out[68]:
['713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '281',
 '713',
 '713',
 '713',
 '713',
 '713',
 '713',
 '281',
 '713',
 '713',
 '713',
 '614',
 '713',
 '303',
 '281',
 '800',
 '800',
 '888']

Monetary amounts in the subjects lines

match something like $10 m,k,b


In [70]:
re.findall(r"\$(\d+) ?(\w+)", all_subjects)


Out[70]:
[('22', '8'),
 ('22', '8'),
 ('10', 'M'),
 ('10', 'M'),
 ('10', 'M'),
 ('10', 'M'),
 ('6', '8'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('25', 'million'),
 ('82', '0'),
 ('82', '0'),
 ('40', 'Million'),
 ('27', 'Billion'),
 ('27', 'Billion'),
 ('5', '0'),
 ('5', '0'),
 ('89', '5'),
 ('89', '5'),
 ('1', '9'),
 ('1', '9'),
 ('1', '9'),
 ('1', '9'),
 ('870', 'K'),
 ('870', 'K'),
 ('14', '1'),
 ('14', '1'),
 ('21', 'billion'),
 ('6', 'million'),
 ('14', 'bln'),
 ('14', 'bln'),
 ('100', 'PRICE'),
 ('250', 'Cap'),
 ('350', 'MM'),
 ('1', '2'),
 ('1', '2'),
 ('1', '2'),
 ('1', '2'),
 ('10', 'Three'),
 ('70', '0'),
 ('70', '0'),
 ('70', '0'),
 ('10', 'you'),
 ('10', 'you'),
 ('13', 'B'),
 ('13', 'B'),
 ('100', 'on'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('500', 'k'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('97', '1'),
 ('1', 'Billion'),
 ('1', 'Billion'),
 ('39', 'in'),
 ('39', 'in'),
 ('1', '0'),
 ('1', '0'),
 ('14', '9'),
 ('5', '0'),
 ('5', '0'),
 ('5', '0'),
 ('2', '1'),
 ('21', 'P'),
 ('550', 'Million'),
 ('455', 'Million'),
 ('5', 'million'),
 ('5', 'million'),
 ('5', 'million'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('100', 'Price'),
 ('2', '0'),
 ('2', '0'),
 ('2', '0'),
 ('2', '0'),
 ('10', '0'),
 ('10', '0'),
 ('10', '0'),
 ('2', '0'),
 ('2', '0'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('2', '9'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('160', '0'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('2', 'Billion'),
 ('6', '7'),
 ('100', 'mil'),
 ('50', 'per'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('21', '2'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('19', '5'),
 ('4', '2'),
 ('4', '2'),
 ('4', '2'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('7', 'MM'),
 ('1', '0'),
 ('1', '0'),
 ('1', '6'),
 ('1', '6'),
 ('1', '6'),
 ('8', 'Million'),
 ('8', 'Million'),
 ('500', 'mm'),
 ('500', 'mm'),
 ('500', 'mm'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('80', 'million'),
 ('50', 'M'),
 ('102', 'Target'),
 ('102', 'Target'),
 ('20', '0'),
 ('5', '0'),
 ('25', 'Million'),
 ('25', 'Million'),
 ('25', 'Million'),
 ('120', 'EXTRA'),
 ('120', 'EXTRA'),
 ('45', 'Million'),
 ('45', 'Million'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('600', 'B'),
 ('600', 'B'),
 ('14', '7'),
 ('14', '7'),
 ('14', '7'),
 ('24', '0'),
 ('24', '0'),
 ('2', '2'),
 ('2', '2'),
 ('2', '2'),
 ('100', 'k'),
 ('7', '7'),
 ('18', '3'),
 ('130', 'Million'),
 ('130', 'Million'),
 ('130', 'Million'),
 ('1', 'mm'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('1', '0'),
 ('128', 'Return'),
 ('128', 'Return')]

In [73]:
vals= []
for item in re.findall(r"\$(\d+) ?([mMbBkK])", all_subjects):
    multiplier = item[1].lower()
    number_val = int(item[0])
    if multiplier == 'K':
        number_val *= 1000
    elif multiplier == 'm':
        number_val *= 1000000
    elif multiplier == 'b':
        number_val *= 100000000
    vals.append(number_val)
sum(vals)


Out[73]:
139151006340

substitution with regular expressions


In [74]:
message = "this is a test, this is only a test"

In [76]:
message.replace("this", "that").replace("text", "walrus")


Out[76]:
'that is a test, that is only a test'

In [78]:
message = "This is a test, this is only a test"
re.sub(r"[Tt]his", "that", message)


Out[78]:
'that is a test, that is only a test'

In [80]:
re.sub(r"\b\w+\b", "PIKACHU", message)


Out[80]:
'PIKACHU PIKACHU PIKACHU PIKACHU, PIKACHU PIKACHU PIKACHU PIKACHU PIKACHU'

In [85]:
anon = re.sub(r"(\d{3})-(\d{3})-(\d{4})", r"\1-\2-XXXX", all_subjects)

In [89]:
re.findall(r"\d{3}-\d{3}-X{4}.{,20}", anon)


Out[89]:
['713-853-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '713-222-XXXX',
 '281-296-XXXX',
 '713-851-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '713-345-XXXX',
 '281-367-XXXX or',
 '713-528-XXXX',
 '713-850-XXXXw/713-703-XXXXc and ',
 '614-888-XXXX',
 '713-767-XXXX re Debbie Chance',
 '303-571-XXXX',
 '281-537-XXXX (home)',
 '800-937-XXXX,',
 '800-937-XXXX and ask for the Jul',
 '888-296-XXXX, HC:']

HTML to SQL

scrapping websites


In [111]:
from urllib.request import urlretrieve
urlretrieve("https://raw.githubusercontent.com/ledeprogram/data-and-databases/master/menupages-morningside-heights.html", "menu.html")


Out[111]:
('menu.html', <http.client.HTTPMessage at 0x109d3f400>)

In [99]:
#store:
#     *restaurant name
#     *price ($$$$$)
#     *cuisines

# every restauatrant has a `<tr>` that is a child of a the `<table>` tag with class `search-results`
# restaurant are in <td> tag with class= `name-address`
# restaurant names are un <a> tag inside that <td>
# restaurant price in a `span` insude an `<td> with a class `price`
# the cuisine of the restaurant is in a `<td> tag iwth no class, the fifth `<td> tag that is a child of a the restaurants `<tr>`

#target:
 
*list of diccionaties

[ 
  {'name: "Brads", price: 1, Cuisines: [coffee]},
  {}'name': "Cafe Nana", 'price'
    ]

In [100]:
# syntax: urlretreive(url, filename)

In [113]:
from bs4 import BeautifulSoup

In [120]:
raw_html = open("menu.html").read()
soup = BeautifulSoup(raw_html, "html.parser")

In [121]:
#Just the names

In [122]:
search_table = soup.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
    name_adress_tag = tr_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    print(tr_tag)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-122-957540aeafed> in <module>()
      3 for tr_tag in table_body.find_all('tr'):
      4     name_adress_tag = tr_tag.find('td', {'class': 'name-adress'})
----> 5     a_tag = name_adress_tag.find('a')
      6     print(tr_tag)

AttributeError: 'NoneType' object has no attribute 'find'

In [ ]:
#how about names and prices? and maybe the cuisine too

In [ ]:


In [123]:
search_table = sopu.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
    # get restaurant name from the inside a td
    # restaurant name = get_name(tr_tag)
    name_adress_tag = tre_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    restaurant_name = a_tag.string
    # get the price from the span if present
    price_tag = tr.tag.find('td', {'class': 'price'})
    price_span_tag = price_tag.fid('span')
    if price_pan_tag:
    price = price_span_tag.string
    else:
        price = 0
    print(restaurant_name, price)


  File "<ipython-input-123-0a472f49bd4f>", line 13
    price = price_span_tag.string
        ^
IndentationError: expected an indented block

In [ ]:
# much organized code using functions

In [124]:
def get_name(tr_tag):
    name_adress_tag = tre_tag.find('td', {'class': 'name-adress'})
    a_tag = name_adress_tag.find('a')
    restaurant_name = a_tag.string
    return restaurant_name
def get_price(tr_tag):
    price_tag = tr.tag.find('td', {'class': 'price'})
    price_span_tag = price_tag.fid('span')
    if price_pan_tag:
    price = price_span_tag.string
    else:
        price = 0
    return price 
def get_cuisines(tr_tag):
    all_td_tag = tr_tag.find_all('td')
    cuisine_tag = all_td_tags[4]
    cuisines = int(cuisining_tag.string)
    if cuisines:
        cuisines_list = cuisines.split(", ")
    else:
        cuisines_list = []
    return cuisines_list


  File "<ipython-input-124-fc415126fe80>", line 10
    price = price_span_tag.string
        ^
IndentationError: expected an indented block

In [125]:
restaurants = []
search_table = sopu.find("table", {'class': 'search-results'})
table_body = search_table.find('tbody')
for tr_tag in table_body.find_all('tr'): 
    restaurant_name = get_name(tr_tag)
    price = get_price(tr_tag)
    cuisines = get_cuisines(tr_tag)
    rest_dict = {'name': restaurant_name, 'price': price, 'cuisines': cuisines }
    restaurants.append(rest_dict)
restaurants


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-125-de43f4e64670> in <module>()
      1 restaurants = []
----> 2 search_table = sopu.find("table", {'class': 'search-results'})
      3 table_body = search_table.find('tbody')
      4 for tr_tag in table_body.find_all('tr'):
      5     restaurant_name = get_name(tr_tag)

NameError: name 'sopu' is not defined

In [126]:
# we want a list of str with get_cousines as a function

In [127]:
import


  File "<ipython-input-127-451c6f6f942e>", line 1
    import
          ^
SyntaxError: invalid syntax

In [ ]: